Objective

  1. Predict the capacity of wind turbines in Canada based on turbine features using xgboost
  2. Interpret model using SHAP values

Get Data

a = read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-10-27/wind-turbine.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   objectid = col_double(),
##   province_territory = col_character(),
##   project_name = col_character(),
##   total_project_capacity_mw = col_double(),
##   turbine_identifier = col_character(),
##   turbine_number_in_project = col_character(),
##   turbine_rated_capacity_k_w = col_double(),
##   rotor_diameter_m = col_double(),
##   hub_height_m = col_double(),
##   manufacturer = col_character(),
##   model = col_character(),
##   commissioning_date = col_character(),
##   latitude = col_double(),
##   longitude = col_double(),
##   notes = col_character()
## )
#abak = a
a = readRDS('a.rds') %>% 
  clean_names() %>% 
  mutate(across(where(is.character),factor)) %>% 
  select(sort(tidyselect::peek_vars())) %>% 
  select(
    where(is.Date),
    where(is.factor),
    where(is.numeric)
  ) %>% slice_sample(prop = 0.333) #for speed, ony using 1/3 of orig dataset

a %>% sample_n(5)
#remove unnecessary cols

a = a %>% select(
    -notes,
    -turbine_identifier,
    -turbine_number_in_project,
    -objectid,
    -total_project_capacity_mw,
    #removing the vars below b/c it would be unrealistic to use them for predictions
    #we likely wouldn't have such information with new data
    -model, 
    -latitude,
    -longitude,
    -project_name
)
a %>% sample_n(5)

check zeroes, miss, inf

a %>% vis_dat()

a %>% vis_miss()

a %>% status() %>% arrange(-p_na)
#Since there are so few rows with missin data, will simply remove missing rows, and not bother with imputation
a = a %>% na.omit()

Express EDA

#normally I would do more thorough EDA, but the purpose of this notebook is to practice creating and interpreting XGBoost Models
library(inspectdf)
## Warning: package 'inspectdf' was built under R version 4.0.3
a %>% inspect_num()
a %>% inspect_cat() 
a %>% inspect_cat() %>% .$levels
## $commissioning_date
## # A tibble: 31 x 3
##    value   prop   cnt
##    <chr>  <dbl> <int>
##  1 2014  0.119    257
##  2 2013  0.104    224
##  3 2011  0.0868   187
##  4 2015  0.0789   170
##  5 2009  0.0771   166
##  6 2006  0.0720   155
##  7 2012  0.0548   118
##  8 2010  0.0460    99
##  9 2016  0.0404    87
## 10 2008  0.0306    66
## # ... with 21 more rows
## 
## $manufacturer
## # A tibble: 21 x 3
##    value                       prop   cnt
##    <chr>                      <dbl> <int>
##  1 Vestas                   0.287     619
##  2 GE                       0.273     588
##  3 Enercon                  0.149     322
##  4 Siemens                  0.143     307
##  5 Senvion                  0.105     227
##  6 NEG Micon                0.0209     45
##  7 Acciona                  0.00418     9
##  8 Acciona Wind Power       0.00418     9
##  9 Nordex                   0.00418     9
## 10 Samsung Renewable Energy 0.00186     4
## # ... with 11 more rows
## 
## $province_territory
## # A tibble: 11 x 3
##    value                         prop   cnt
##    <chr>                        <dbl> <int>
##  1 Ontario                   0.371      800
##  2 Quebec                    0.317      682
##  3 Alberta                   0.141      304
##  4 British Columbia          0.0478     103
##  5 Nova Scotia               0.0404      87
##  6 Saskatchewan              0.0232      50
##  7 New Brunswick             0.0200      43
##  8 Prince Edward Island      0.0186      40
##  9 Manitoba                  0.0153      33
## 10 Newfoundland and Labrador 0.00511     11
## 11 Northwest Territories     0.000464     1
a %>% inspect_cor()
a %>% inspect_types()
library(WVPlots)
## Warning: package 'WVPlots' was built under R version 4.0.3
#https://cran.r-project.org/web/packages/WVPlots/vignettes/WVPlots_examples.html
a %>% ScatterHist('rotor_diameter_m','turbine_rated_capacity_k_w', smoothmethod = 'lm','rotor diameter vs turbine output')
## Warning: Removed 3 rows containing missing values (geom_smooth).

a %>% ScatterHist('hub_height_m','turbine_rated_capacity_k_w', smoothmethod = 'lm','hub height vs turbine output')

library(ggpointdensity)
## Warning: package 'ggpointdensity' was built under R version 4.0.3
ggplotly(a %>% ggplot(aes(rotor_diameter_m, turbine_rated_capacity_k_w)) + geom_pointdensity(size = 1.2) + scale_color_viridis_c() + ggtitle('Rotor Diameter vs Turbine Output'))
ggplotly(a %>% ggplot(aes(hub_height_m, turbine_rated_capacity_k_w)) + geom_pointdensity(size = 1.2) + scale_color_viridis_c() + ggtitle('Hub Height vs Turbine Output'))

1) split data

split = a %>% initial_split(strata = province_territory)
train = split %>% training
test = split %>% testing
vfold = train %>% vfold_cv

2) spec rec, mdl, wf

rec = train %>% recipe(turbine_rated_capacity_k_w ~ . ) %>% 
  step_zv(all_numeric(), - all_outcomes()) %>% 
  step_nzv(all_numeric(), - all_outcomes()) %>% 
  step_corr(all_numeric(), - all_outcomes()) %>%
  #----------------------------
  step_other(manufacturer, - all_outcomes()) %>% 
  step_normalize(all_numeric(), - all_outcomes()) %>% 
  step_dummy(all_nominal(), - all_outcomes(), one_hot = TRUE)

baked.train = rec %>% prep %>% bake(new_data = NULL)
baked.test = rec %>% prep %>% bake(new_data = test)

#---------------------------------------------------

mdl = parsnip::boost_tree(
  trees = tune(),
  min_n = tune(),
  tree_depth = tune(),
  learn_rate = tune(),
  loss_reduction = tune(),
  sample_size = tune()
  ) %>% 
  set_mode("regression") %>% 
  set_engine("xgboost") 

#---------------------------------------------------

wf = workflow() %>% 
  add_recipe(rec) %>% 
  add_model(mdl)

3) execute tunegrid, finalize wf

set.seed(345)
doParallel::registerDoParallel()

tg = tune_grid(
  object = wf,
  resamples = vfold,
  grid = 10
)

#---------------------------------------------------

wf = wf %>% finalize_workflow(tg %>% select_best())
## Warning: No value of `metric` was given; metric 'rmse' will be used.
beep(5)

4) using wf, fit train, finalize mdl

(mdl = wf %>% fit(train) %>% pull_workflow_fit())
## parsnip model object
## 
## Fit time:  5.1s 
## ##### xgb.Booster
## raw: 1 Mb 
## call:
##   xgboost::xgb.train(params = list(eta = 0.0132231849539539, max_depth = 12L, 
##     gamma = 0.0000000985063220750636, colsample_bytree = 1, min_child_weight = 19L, 
##     subsample = 0.669344357615337), data = x$data, nrounds = 598L, 
##     watchlist = x$watchlist, verbose = 0, objective = "reg:squarederror", 
##     nthread = 1)
## params (as set within xgb.train):
##   eta = "0.0132231849539539", max_depth = "12", gamma = "0.0000000985063220750636", colsample_bytree = "1", min_child_weight = "19", subsample = "0.669344357615337", objective = "reg:squarederror", nthread = "1", validate_parameters = "TRUE"
## xgb.attributes:
##   niter
## callbacks:
##   cb.evaluation.log()
## # of features: 55 
## niter: 598
## nfeatures : 55 
## evaluation_log:
##     iter training_rmse
##        1          2034
##        2          2008
## ---                   
##      597            64
##      598            64

5) using wf, fit train, evaluate test, return results

kekka = wf %>% last_fit(split)
kekka %>% collect_metrics()
kekka %>% collect_predictions() %>%
  select(.pred, turbine_rated_capacity_k_w) %>% 
  metric_set(rmse, mae, mape, rsq)(estimate = .pred, truth = turbine_rated_capacity_k_w)
ggplotly(
kekka %>% collect_predictions() %>%
  select(.pred, turbine_rated_capacity_k_w) %>% 
  ggplot(aes(.pred, turbine_rated_capacity_k_w)) +
  geom_abline(slope = 1, linetype = 'dotted', color = 'tomato3') +
  geom_point(alpha = 0.7, size = 1.5, color = 'slateblue3') +
  labs(
    title = 'Turbine Capacity (KW) Predictions vs Actuals', x = 'Predicted', y = 'Actuals'
  )
)

#SHAPforxgboost

0) create xgboost model

X_train = baked.train %>% select(-turbine_rated_capacity_k_w) %>% as.matrix
y = baked.train$turbine_rated_capacity_k_w

(best.hps = tg %>% select_best())
## Warning: No value of `metric` was given; metric 'rmse' will be used.
#https://parsnip.tidymodels.org/reference/boost_tree.html#see-also

library(xgboost)
## Warning: package 'xgboost' was built under R version 4.0.3
mdl.xgb = xgboost(
  data = X_train,
  label = y,
  max.depth = best.hps$tree_depth,
  nrounds = best.hps$trees,
  eta = best.hps$learn_rate,
  min.child.weight = best.hps$min_n,
  gamma = best.hps$loss_reduction,
  subsample = best.hps$sample_size,
  #----------------------------
  nthread = parallel::detectCores() - 2,
  verbose = FALSE,
  objective = 'reg:squarederror'
)

1) Individual Observation Top/Bottom SHAP shap.values

shap.vals = shap.values(
  mdl.xgb, X_train
)

#----------------------------individual observation's SHAP values 
observation = 88

shap.vals$shap_score %>% dplyr::slice(observation)
shap.vals$shap_score %>% dplyr::slice(observation) %>%
  pivot_longer(everything()) %>% 
  rename('feature' = 'name', 'SHAP' = 'value') %>% 
  filter(SHAP > 1 | SHAP < -1) %>%
  mutate(
    feature = fct_reorder(feature, SHAP),
    SHAP = round(SHAP, 1)
    ) %>% 
  plot_ly(x = ~SHAP, y = ~feature, color = ~if_else(SHAP < 0, I('darkred'), I('darkgreen'))) %>% 
  layout(
    title = paste0('Local: Significant SHAP Values for Observation ', observation),
    xaxis = list(title = paste0(
      'Cumulative Total SHAP Values: ',
      round(shap.vals$shap_score %>% dplyr::slice(observation) %>% rowSums(), 2)
      )),
    yaxis = list(title = ''))
## No trace type specified:
##   Based on info supplied, a 'bar' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#bar
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Warning in if (is.na(col)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(col)) {: the condition has length > 1 and only the first
## element will be used
## Warning: `error_y.color` does not currently support multiple values.
## Warning: `error_x.color` does not currently support multiple values.

2) Aggregate Model Top N SHAP shap.values

#----------------------------top 10 Features avg SHAP values

n = 10

tibble(
features = shap.vals$mean_shap_score %>% sort(decreasing = TRUE) %>% head(n) %>% names,
SHAP = shap.vals$mean_shap_score %>% sort(decreasing = TRUE) %>% head(n)
) %>% mutate(features = fct_reorder(features, SHAP)) %>% 
  plot_ly(x = ~SHAP, y = ~features) %>% 
  layout(
    title = paste0('Global: Average SHAP for Top ', n ,' Features'),
    xaxis = list(title = ''),
    yaxis = list(title = '')
    )
## No trace type specified:
##   Based on info supplied, a 'bar' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#bar

3) Aggregate Model Top N SHAP shap.plot.summary.wrap1

#https://www.rdocumentation.org/packages/SHAPforxgboost/versions/0.0.4/topics/shap.plot.summary.wrap1

n = 10

ggplotly(
shap.plot.summary.wrap1(
  mdl.xgb, X_train,
  top_n = n
  ) + labs(
    title = paste0('Global: Distribution of SHAP for Top ', n ,' Features')
  )
)

checking if results make sense

ggplotly(
baked.train %>%
  mutate(manufacturer_GE = if_else(manufacturer_GE == 1, 'GE','Other')) %>% 
  ggplot(aes(manufacturer_GE, turbine_rated_capacity_k_w, group = manufacturer_GE)) +
  geom_boxplot(aes(fill = manufacturer_GE), alpha = 0.50) +
  labs(
    title = 'Distribution: GE turbine capacity (KW) vs Other Manufacturers',
    y = '', x = ''
  ) + scale_fill_manual(values = c('dodgerblue3','grey60')) +
  theme(legend.position = 'none')
)
## Warning: `group_by_()` is deprecated as of dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
#----------------------------

ggplotly(
baked.train %>%
  mutate(manufacturer_Enercon = if_else(manufacturer_Enercon == 1, 'Enercon','Other')) %>% 
  ggplot(aes(manufacturer_Enercon, turbine_rated_capacity_k_w, group = manufacturer_Enercon)) +
  geom_boxplot(aes(fill = manufacturer_Enercon), alpha = 0.50) +
  labs(
    title = 'Distribution: Enercon turbine capacity (KW) vs Other Manufacturers',
    y = '', x = ''
  ) + scale_fill_manual(values = c('seagreen4','grey60')) +
  theme(legend.position = 'none')
)

4) Dependence Plots shap.prep & shap.plot.dependence

n = 5

top.n.features = shap.vals$mean_shap_score %>% sort(decreasing = TRUE) %>% head(n) %>% names

X_train.long = shap.prep(xgb_model = mdl.xgb, X_train = X_train)

map(
  .x = top.n.features,
  ~shap.plot.dependence(
    data_long = X_train.long,
    x = .x,
    size0 = 1.5,
    add_hist = TRUE
  )
)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 8.1421e-029
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 8.1421e-029
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 2.5e-005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 5)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 2.5e-005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 5)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 5.4265e-029
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 5.4265e-029
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01

## [[1]]
## 
## [[2]]
## 
## [[3]]
## 
## [[4]]
## 
## [[5]]

5) Interaction Plots shap.plot.dependence

#The SHAP interaction values take time since it calculates all the combinations.

shap.plot.dependence(
  data_long = X_train.long,
  data_int = shap.prep.interaction(
    xgb_model = mdl.xgb,
    X_train = X_train
    ),
  x = 'rotor_diameter_m',
  y = 'hub_height_m'
)
## `geom_smooth()` using formula 'y ~ x'

Force Plots

stack.data = shap.prep.stack.data(
  shap_contrib = shap.vals$shap_score,
  top_n = 3,
  n_groups = 3
  )
## The SHAP values of the Rest 52 features were summed into variable 'rest_variables'.
shap.plot.force_plot(
  stack.data
  #zoom_in_location = ,
  #y_parent_limit = c(- , +)
)
## Data has N = 1617 | zoom in length is 150 at location 970.2.

shap.plot.force_plot_bygroup(stack.data)